Original Data¶
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [3]:
# Import dataset
df = pd.read_csv('Death.csv')
In [4]:
# Shape
df.shape
Out[4]:
(137700, 16)
In [5]:
# Columns
df.columns
Out[5]:
Index(['Data As Of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
'State', 'Sex', 'Age Group', 'COVID-19 Deaths', 'Total Deaths',
'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
'Pneumonia, Influenza, or COVID-19 Deaths', 'Footnote'],
dtype='object')
In [6]:
# Head
df.head()
Out[6]:
| | Data As Of | Start Date | End Date | Group | Year | Month | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths | Footnote |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 09/27/2023 | 01/01/2020 | 09/23/2023 | By Total | NaN | NaN | United States | All Sexes | All Ages | 1146774.0 | 12303399.0 | 1162844.0 | 569264.0 | 22229.0 | 1760095.0 | NaN |
| 1 | 09/27/2023 | 01/01/2020 | 09/23/2023 | By Total | NaN | NaN | United States | All Sexes | Under 1 year | 519.0 | 73213.0 | 1056.0 | 95.0 | 64.0 | 1541.0 | NaN |
| 2 | 09/27/2023 | 01/01/2020 | 09/23/2023 | By Total | NaN | NaN | United States | All Sexes | 0-17 years | 1696.0 | 130970.0 | 2961.0 | 424.0 | 509.0 | 4716.0 | NaN |
| 3 | 09/27/2023 | 01/01/2020 | 09/23/2023 | By Total | NaN | NaN | United States | All Sexes | 1-4 years | 285.0 | 14299.0 | 692.0 | 66.0 | 177.0 | 1079.0 | NaN |
| 4 | 09/27/2023 | 01/01/2020 | 09/23/2023 | By Total | NaN | NaN | United States | All Sexes | 5-14 years | 509.0 | 22008.0 | 818.0 | 143.0 | 219.0 | 1390.0 | NaN |
In [7]:
# Information
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137700 entries, 0 to 137699
Data columns (total 16 columns):
 #   Column                                    Non-Null Count   Dtype
---  ------                                    --------------   -----
 0   Data As Of                                137700 non-null  object
 1   Start Date                                137700 non-null  object
 2   End Date                                  137700 non-null  object
 3   Group                                     137700 non-null  object
 4   Year                                      134946 non-null  float64
 5   Month                                     123930 non-null  float64
 6   State                                     137700 non-null  object
 7   Sex                                       137700 non-null  object
 8   Age Group                                 137700 non-null  object
 9   COVID-19 Deaths                           98270 non-null   float64
 10  Total Deaths                              118191 non-null  float64
 11  Pneumonia Deaths                          92836 non-null   float64
 12  Pneumonia and COVID-19 Deaths             100816 non-null  float64
 13  Influenza Deaths                          111012 non-null  float64
 14  Pneumonia, Influenza, or COVID-19 Deaths  93467 non-null   float64
 15  Footnote                                  97896 non-null   object
dtypes: float64(8), object(8)
memory usage: 16.8+ MB
In [8]:
# Print all unique values for the object (string) columns
for column in df.columns:
    if df[column].dtype == object:
        print(f"Unique values for {column}: {df[column].unique()}")
Unique values for Data As Of: ['09/27/2023']
Unique values for Start Date: ['01/01/2020' '01/01/2021' '01/01/2022' '01/01/2023' '02/01/2020' '03/01/2020' '04/01/2020' '05/01/2020' '06/01/2020' '07/01/2020' '08/01/2020' '09/01/2020' '10/01/2020' '11/01/2020' '12/01/2020' '02/01/2021' '03/01/2021' '04/01/2021' '05/01/2021' '06/01/2021' '07/01/2021' '08/01/2021' '09/01/2021' '10/01/2021' '11/01/2021' '12/01/2021' '02/01/2022' '03/01/2022' '04/01/2022' '05/01/2022' '06/01/2022' '07/01/2022' '08/01/2022' '09/01/2022' '10/01/2022' '11/01/2022' '12/01/2022' '02/01/2023' '03/01/2023' '04/01/2023' '05/01/2023' '06/01/2023' '07/01/2023' '08/01/2023' '09/01/2023']
Unique values for End Date: ['09/23/2023' '12/31/2020' '12/31/2021' '12/31/2022' '01/31/2020' '02/29/2020' '03/31/2020' '04/30/2020' '05/31/2020' '06/30/2020' '07/31/2020' '08/31/2020' '09/30/2020' '10/31/2020' '11/30/2020' '01/31/2021' '02/28/2021' '03/31/2021' '04/30/2021' '05/31/2021' '06/30/2021' '07/31/2021' '08/31/2021' '09/30/2021' '10/31/2021' '11/30/2021' '01/31/2022' '02/28/2022' '03/31/2022' '04/30/2022' '05/31/2022' '06/30/2022' '07/31/2022' '08/31/2022' '09/30/2022' '10/31/2022' '11/30/2022' '01/31/2023' '02/28/2023' '03/31/2023' '04/30/2023' '05/31/2023' '06/30/2023' '07/31/2023' '08/31/2023']
Unique values for Group: ['By Total' 'By Year' 'By Month']
Unique values for State: ['United States' 'Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado' 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky' 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey' 'New Mexico' 'New York' 'New York City' 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia' 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Puerto Rico']
Unique values for Sex: ['All Sexes' 'Male' 'Female']
Unique values for Age Group: ['All Ages' 'Under 1 year' '0-17 years' '1-4 years' '5-14 years' '15-24 years' '18-29 years' '25-34 years' '30-39 years' '35-44 years' '40-49 years' '45-54 years' '50-64 years' '55-64 years' '65-74 years' '75-84 years' '85 years and over']
Unique values for Footnote: [nan 'One or more data cells have counts between 1-9 and have been suppressed in accordance with NCHS confidentiality standards.']
In [9]:
# Checking for missing data
df.isna().sum()
Out[9]:
Data As Of                                      0
Start Date                                      0
End Date                                        0
Group                                           0
Year                                         2754
Month                                       13770
State                                           0
Sex                                             0
Age Group                                       0
COVID-19 Deaths                             39430
Total Deaths                                19509
Pneumonia Deaths                            44864
Pneumonia and COVID-19 Deaths               36884
Influenza Deaths                            26688
Pneumonia, Influenza, or COVID-19 Deaths    44233
Footnote                                    39804
dtype: int64
In [10]:
# Drop useless columns
columns_to_drop = ["Data As Of", "Start Date", "End Date", "Footnote"]
df = df.drop(columns=columns_to_drop, errors='ignore')
In [11]:
df.head()
Out[11]:
| | Group | Year | Month | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | By Total | NaN | NaN | United States | All Sexes | All Ages | 1146774.0 | 12303399.0 | 1162844.0 | 569264.0 | 22229.0 | 1760095.0 |
| 1 | By Total | NaN | NaN | United States | All Sexes | Under 1 year | 519.0 | 73213.0 | 1056.0 | 95.0 | 64.0 | 1541.0 |
| 2 | By Total | NaN | NaN | United States | All Sexes | 0-17 years | 1696.0 | 130970.0 | 2961.0 | 424.0 | 509.0 | 4716.0 |
| 3 | By Total | NaN | NaN | United States | All Sexes | 1-4 years | 285.0 | 14299.0 | 692.0 | 66.0 | 177.0 | 1079.0 |
| 4 | By Total | NaN | NaN | United States | All Sexes | 5-14 years | 509.0 | 22008.0 | 818.0 | 143.0 | 219.0 | 1390.0 |
In [12]:
# Keep the rows where "Group" == "By Total" in a new DataFrame, df1
df1 = df[df['Group'] == 'By Total']
In [13]:
# Drop "Year", "Month" and "Group" in df1
df1 = df1.drop(['Year', 'Month', 'Group'], axis=1)
In [14]:
df1.head()
Out[14]:
| | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths |
|---|---|---|---|---|---|---|---|---|---|
| 0 | United States | All Sexes | All Ages | 1146774.0 | 12303399.0 | 1162844.0 | 569264.0 | 22229.0 | 1760095.0 |
| 1 | United States | All Sexes | Under 1 year | 519.0 | 73213.0 | 1056.0 | 95.0 | 64.0 | 1541.0 |
| 2 | United States | All Sexes | 0-17 years | 1696.0 | 130970.0 | 2961.0 | 424.0 | 509.0 | 4716.0 |
| 3 | United States | All Sexes | 1-4 years | 285.0 | 14299.0 | 692.0 | 66.0 | 177.0 | 1079.0 |
| 4 | United States | All Sexes | 5-14 years | 509.0 | 22008.0 | 818.0 | 143.0 | 219.0 | 1390.0 |
In [15]:
# Keep the rows where "Group" == "By Year" in a new DataFrame, df2
df2 = df[df['Group'] == 'By Year']
In [16]:
# Drop "Month" and "Group" in df2
df2 = df2.drop(['Month', 'Group'], axis=1)
df2.head()
Out[16]:
| | Year | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths |
|---|---|---|---|---|---|---|---|---|---|---|
| 2754 | 2020.0 | United States | All Sexes | All Ages | 385666.0 | 3390039.0 | 352010.0 | 180086.0 | 8787.0 | 565226.0 |
| 2755 | 2020.0 | United States | All Sexes | Under 1 year | 52.0 | 19645.0 | 242.0 | 9.0 | 21.0 | 306.0 |
| 2756 | 2020.0 | United States | All Sexes | 0-17 years | 199.0 | 34204.0 | 629.0 | 36.0 | 179.0 | 971.0 |
| 2757 | 2020.0 | United States | All Sexes | 1-4 years | 25.0 | 3539.0 | 134.0 | 4.0 | 61.0 | 216.0 |
| 2758 | 2020.0 | United States | All Sexes | 5-14 years | 69.0 | 5644.0 | 173.0 | 12.0 | 76.0 | 306.0 |
In [17]:
# Keep the rows where "Group" == "By Month" in a new DataFrame, df3
df3 = df[df['Group'] == 'By Month']
In [18]:
# Drop "Group"
df3 = df3.drop('Group', axis=1)
In [19]:
df3.head()
Out[19]:
| | Year | Month | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 13770 | 2020.0 | 1.0 | United States | All Sexes | All Ages | 6.0 | 264677.0 | 17909.0 | 3.0 | 2125.0 | 20037.0 |
| 13771 | 2020.0 | 1.0 | United States | All Sexes | Under 1 year | 0.0 | 1784.0 | 41.0 | 0.0 | 8.0 | 49.0 |
| 13772 | 2020.0 | 1.0 | United States | All Sexes | 0-17 years | 0.0 | 2966.0 | 90.0 | 0.0 | 63.0 | 153.0 |
| 13773 | 2020.0 | 1.0 | United States | All Sexes | 1-4 years | 0.0 | 315.0 | 22.0 | 0.0 | 18.0 | 40.0 |
| 13774 | 2020.0 | 1.0 | United States | All Sexes | 5-14 years | 0.0 | 471.0 | 21.0 | 0.0 | 29.0 | 50.0 |
In [20]:
# Remove rows with null values in df1, df2, and df3
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()
In [21]:
# Check for null values in df1, df2, and df3
print("Null values in df1:\n", df1.isnull().sum())
print("\nNull values in df2:\n", df2.isnull().sum())
print("\nNull values in df3:\n", df3.isnull().sum())
Null values in df1:
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64

Null values in df2:
Year                                        0
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64

Null values in df3:
Year                                        0
Month                                       0
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64
In [22]:
# Check duplicate data
duplicates_df1 = df1[df1.duplicated()]
duplicates_df2 = df2[df2.duplicated()]
duplicates_df3 = df3[df3.duplicated()]
print("Duplicate rows in df1:\n", duplicates_df1)
print("\nDuplicate rows in df2:\n", duplicates_df2)
print("\nDuplicate rows in df3:\n", duplicates_df3)
Duplicate rows in df1:
Empty DataFrame
Columns: [State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []

Duplicate rows in df2:
Empty DataFrame
Columns: [Year, State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []

Duplicate rows in df3:
Empty DataFrame
Columns: [Year, Month, State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []
By Total¶
In [24]:
# Remove "United States" in "State"
df1 = df1[df1['State'] != 'United States']
In [25]:
# Map full state names to two-letter USPS abbreviations
us_state_abbrev = {
'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
'Wisconsin': 'WI', 'Wyoming': 'WY'
}
df1['State'] = df1['State'].map(us_state_abbrev)
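Note that the mapping covers the 50 states only; 'New York City', 'District of Columbia', and 'Puerto Rico' appear in the State column but have no entry, so those rows become NaN and silently drop out of the state-level maps below. A quick check, as a sketch:

# count rows whose State value had no USPS abbreviation in the mapping above
print(df1['State'].isna().sum(), "rows with an unmapped State value")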
In [26]:
df1.head()
Out[26]:
| | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths |
|---|---|---|---|---|---|---|---|---|---|
| 51 | AL | All Sexes | All Ages | 21520.0 | 231602.0 | 17619.0 | 7411.0 | 356.0 | 32038.0 |
| 59 | AL | All Sexes | 30-39 years | 416.0 | 6827.0 | 319.0 | 147.0 | 13.0 | 599.0 |
| 60 | AL | All Sexes | 35-44 years | 670.0 | 8639.0 | 468.0 | 231.0 | 16.0 | 921.0 |
| 61 | AL | All Sexes | 40-49 years | 1053.0 | 11224.0 | 748.0 | 359.0 | 14.0 | 1455.0 |
| 62 | AL | All Sexes | 45-54 years | 1628.0 | 15413.0 | 1114.0 | 557.0 | 23.0 | 2206.0 |
In [27]:
# Remove "All Sexes" in "Sex" and "All Ages" in "Age Group"
df1 = df1[df1['Sex'] != 'All Sexes']
df1 = df1[df1['Age Group'] != 'All Ages']
In [28]:
# States with the highest death counts ('Total Deaths' is already part of the category list)
death_categories_top5 = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths', 'Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']
state_death_summary_top5 = df1.groupby('State')[death_categories_top5].sum().reset_index()
ranked_states = {}
for category in death_categories_top5:
    ranked_states[category] = state_death_summary_top5[['State', category]] \
        .sort_values(by=category, ascending=False) \
        .head(5)
for category, data in ranked_states.items():
    print(f"Top 5 States for {category}:")
    print(data)
    print()
Top 5 States for COVID-19 Deaths:
   State  COVID-19 Deaths
4     CA         139640.0
42    TX         137829.0
8     FL         100759.0
37    PA          61219.0
34    OH          58631.0

Top 5 States for Total Deaths:
   State  Total Deaths
4     CA     1472026.0
42    TX     1191557.0
8     FL     1114690.0
37    PA      633413.0
34    OH      620127.0

Top 5 States for Pneumonia Deaths:
   State  Pneumonia Deaths
4     CA          155609.0
42    TX          134735.0
8     FL          118012.0
34    OH           54056.0
37    PA           53160.0

Top 5 States for Pneumonia and COVID-19 Deaths:
   State  Pneumonia and COVID-19 Deaths
4     CA                        79958.0
42    TX                        77169.0
8     FL                        62463.0
34    OH                        28950.0
37    PA                        27002.0

Top 5 States for Influenza Deaths:
   State  Influenza Deaths
4     CA            2316.0
42    TX            2079.0
8     FL            1568.0
37    PA            1125.0
34    OH            1082.0

Top 5 States for Pneumonia, Influenza, or COVID-19 Deaths:
   State  Pneumonia, Influenza, or COVID-19 Deaths
4     CA                                  217494.0
42    TX                                  197226.0
8     FL                                  157738.0
37    PA                                   88401.0
34    OH                                   84764.0
In [29]:
# Choropleth maps of deaths by state
import plotly.express as px
fig1 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='Total Deaths',
scope="usa",
color_continuous_scale="reds",
title='COVID-19 Total Deaths by State')
fig2 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='COVID-19 Deaths',
scope="usa",
color_continuous_scale="reds",
title='COVID-19 Deaths by State')
fig3 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='Pneumonia Deaths',
scope="usa",
color_continuous_scale="reds",
title='Pneumonia Deaths by State')
fig4 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='Pneumonia and COVID-19 Deaths',
scope="usa",
color_continuous_scale="reds",
title='Pneumonia and COVID-19 Deaths by State')
fig5 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='Influenza Deaths',
scope="usa",
color_continuous_scale="reds",
title='Influenza Deaths by State')
fig6 = px.choropleth(df1,
locations='State',
locationmode="USA-states",
color='Pneumonia, Influenza, or COVID-19 Deaths',
scope="usa",
color_continuous_scale="reds",
title='Pneumonia, Influenza, or COVID-19 Deaths by State')
fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
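The six figures above differ only in the column used for color. As a minimal sketch (assuming df1 and death_categories_top5 from the cells above; state_totals is a hypothetical name), the same maps can be built in a loop over per-state sums, so each state is drawn once from its aggregated value rather than once per sex/age row:

# aggregate each death category per state, then build one choropleth per category
state_totals = df1.groupby('State', as_index=False)[death_categories_top5].sum()
for category in death_categories_top5:
    fig = px.choropleth(state_totals,
                        locations='State',
                        locationmode="USA-states",
                        color=category,
                        scope="usa",
                        color_continuous_scale="reds",
                        title=f'{category} by State')
    fig.show()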
By Year¶
In [31]:
# Remove "All Sexes" in "Sex" and "All Ages" in "Age Group" in df2
df2 = df2[df2['Sex'] != 'All Sexes']
df2 = df2[df2['Age Group'] != 'All Ages']
In [32]:
# Death by Sex
death_categories = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
'Pneumonia, Influenza, or COVID-19 Deaths']
sex_death_summary = df2.groupby('Sex')[death_categories].sum().reset_index()
ranked_sex = {}
for category in death_categories:
    ranked_sex[category] = sex_death_summary[['Sex', category]] \
        .sort_values(by=category, ascending=False)
for category, data in ranked_sex.items():
    print(f"Ranking for {category} by sex:")
    print(data)
    print()
Ranking for COVID-19 Deaths by sex:
Sex COVID-19 Deaths
1 Male 1290612.0
0 Female 1013424.0
Ranking for Total Deaths by sex:
Sex Total Deaths
1 Male 13414684.0
0 Female 11471872.0
Ranking for Pneumonia Deaths by sex:
Sex Pneumonia Deaths
1 Male 1307286.0
0 Female 1024331.0
Ranking for Pneumonia and COVID-19 Deaths by sex:
Sex Pneumonia and COVID-19 Deaths
1 Male 667663.0
0 Female 479919.0
Ranking for Influenza Deaths by sex:
Sex Influenza Deaths
0 Female 25070.0
1 Male 24746.0
Ranking for Pneumonia, Influenza, or COVID-19 Deaths by sex:
Sex Pneumonia, Influenza, or COVID-19 Deaths
1 Male 1951649.0
0 Female 1580457.0
In [33]:
# Grouped bar chart of death counts by category and sex
df_sex_category = df2.groupby('Sex')[['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths','Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']].sum()
categories = df_sex_category.columns
x = np.arange(len(categories))
width = 0.35
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width/2, df_sex_category.loc['Male'], width, label='Male', color='blue')
ax.bar(x + width/2, df_sex_category.loc['Female'], width, label='Female', color='orange')
ax.set_xlabel('Death Category')
ax.set_ylabel('Total Death Counts')
ax.set_title('Total Deaths by Category and Sex')
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=45, ha='right')
ax.legend()
plt.tight_layout()
plt.show()
In [34]:
# Unique values in "Age Group"
print(df1['Age Group'].unique())
['45-54 years' '50-64 years' '55-64 years' '65-74 years' '75-84 years' '85 years and over' '5-14 years' '35-44 years' '40-49 years' '0-17 years' '18-29 years' '25-34 years' '30-39 years' '15-24 years' 'Under 1 year' '1-4 years']
In [35]:
# Death by Age
death_categories = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
'Pneumonia, Influenza, or COVID-19 Deaths']
age_death_summary = df2.groupby('Age Group')[death_categories].sum().reset_index()
ranked_age = {}
for category in death_categories:
    ranked_age[category] = age_death_summary[['Age Group', category]] \
        .sort_values(by=category, ascending=False).head(5)
for category, data in ranked_age.items():
    print(f"Ranking for {category} by age:")
    print(data)
    print()
Ranking for COVID-19 Deaths by age:
Age Group COVID-19 Deaths
14 85 years and over 539510.0
13 75-84 years 499796.0
12 65-74 years 417231.0
10 50-64 years 324039.0
11 55-64 years 248667.0
Ranking for Total Deaths by age:
Age Group Total Deaths
14 85 years and over 6045580.0
13 75-84 years 5157094.0
12 65-74 years 4135015.0
10 50-64 years 3324078.0
11 55-64 years 2538614.0
Ranking for Pneumonia Deaths by age:
Age Group Pneumonia Deaths
13 75-84 years 535288.0
14 85 years and over 521898.0
12 65-74 years 447444.0
10 50-64 years 326290.0
11 55-64 years 253520.0
Ranking for Pneumonia and COVID-19 Deaths by age:
Age Group Pneumonia and COVID-19 Deaths
13 75-84 years 249998.0
12 65-74 years 224917.0
14 85 years and over 223667.0
10 50-64 years 173917.0
11 55-64 years 133561.0
Ranking for Influenza Deaths by age:
Age Group Influenza Deaths
14 85 years and over 10039.0
13 75-84 years 10014.0
12 65-74 years 8513.0
10 50-64 years 7935.0
11 55-64 years 6065.0
Ranking for Pneumonia, Influenza, or COVID-19 Deaths by age:
Age Group Pneumonia, Influenza, or COVID-19 Deaths
14 85 years and over 846767.0
13 75-84 years 793993.0
12 65-74 years 647175.0
10 50-64 years 483345.0
11 55-64 years 373925.0
In [36]:
# Grouped bar chart of death counts by age group
df_age_group = df2.groupby('Age Group')[['COVID-19 Deaths', 'Pneumonia Deaths',
'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
'Pneumonia, Influenza, or COVID-19 Deaths']].sum()
age_order = ['Under 1 year', '1-4 years', '5-14 years', '15-24 years', '18-29 years',
'25-34 years', '30-39 years', '35-44 years', '40-49 years',
'45-54 years', '50-64 years', '55-64 years', '65-74 years',
'75-84 years', '85 years and over']
df_age_group = df_age_group.reindex(age_order)
death_categories = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths',
'Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']
colors = ['blue', 'orange', 'green', 'red', 'purple']
x = np.arange(len(age_order))
width = 0.15
plt.figure(figsize=(15, 8))
for i, (category, color) in enumerate(zip(death_categories, colors)):
    plt.bar(x + i * width, df_age_group[category], width, label=category, color=color)
plt.title('Death Counts by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Death Counts')
plt.xticks(x + width * 2, age_order, rotation=45, ha='right')
plt.legend()
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
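One caveat for this chart: some of the age groups overlap (for example '0-17 years' overlaps 'Under 1 year', '1-4 years', and '5-14 years', and '50-64 years' overlaps '55-64 years'), so adjacent bars partly double-count deaths. A sketch that restricts the plot to one mutually exclusive set of bands (non_overlapping and df_age_plot are hypothetical names):

# keep only non-overlapping age bands before plotting
non_overlapping = ['Under 1 year', '1-4 years', '5-14 years', '15-24 years', '25-34 years',
                   '35-44 years', '45-54 years', '55-64 years', '65-74 years', '75-84 years',
                   '85 years and over']
df_age_plot = df_age_group.reindex(non_overlapping)

Re-running the plotting loop above with df_age_plot (and x = np.arange(len(non_overlapping))) then counts each death in exactly one bar.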
By Month¶
In [38]:
# Unique values of Year and Month in df3
unique_years_df3 = df3['Year'].unique()
unique_months_df3 = df3['Month'].unique()
print("Unique years in df3:", unique_years_df3)
print("Unique months in df3:", unique_months_df3)
Unique years in df3: [2020. 2021. 2022. 2023.]
Unique months in df3: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
In [39]:
# Create a Date column from Year and Month
df3['Year'] = df3['Year'].astype(int)
df3['Month'] = df3['Month'].astype(int)
df3['Date'] = pd.to_datetime(df3['Year'].astype(str) + '-' + df3['Month'].astype(str), format='%Y-%m')
In [40]:
# Months with the highest death counts
death_categories_top6 = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
'Pneumonia, Influenza, or COVID-19 Deaths']
ranked_dates = {}
for category in death_categories_top6:
    ranked_dates[category] = df3[['Date', category]] \
        .sort_values(by=category, ascending=False) \
        .head(5)
for category, data in ranked_dates.items():
    print(f"Top 5 Dates for {category}:")
    print(data)
    print()
Top 5 Dates for COVID-19 Deaths:
Date COVID-19 Deaths
14382 2021-01-01 105565.0
14331 2020-12-01 98174.0
14994 2022-01-01 84011.0
13923 2020-04-01 65550.0
14790 2021-09-01 63444.0
Top 5 Dates for Total Deaths:
Date Total Deaths
14382 2021-01-01 373641.0
14994 2022-01-01 370245.0
14331 2020-12-01 367203.0
13923 2020-04-01 322414.0
14943 2021-12-01 320036.0
Top 5 Dates for Pneumonia Deaths:
Date Pneumonia Deaths
14382 2021-01-01 69849.0
14331 2020-12-01 62916.0
14994 2022-01-01 59484.0
14790 2021-09-01 51085.0
13923 2020-04-01 46427.0
Top 5 Dates for Pneumonia and COVID-19 Deaths:
Date Pneumonia and COVID-19 Deaths
14382 2021-01-01 55416.0
14331 2020-12-01 48324.0
14994 2022-01-01 43699.0
14790 2021-09-01 38294.0
14399 2021-01-01 32387.0
Top 5 Dates for Influenza Deaths:
Date Influenza Deaths
15555 2022-12-01 4460.0
13872 2020-03-01 2437.0
15589 2022-12-01 2411.0
13821 2020-02-01 2373.0
15606 2023-01-01 2238.0
Top 5 Dates for Pneumonia, Influenza, or COVID-19 Deaths:
Date Pneumonia, Influenza, or COVID-19 Deaths
14382 2021-01-01 120079.0
14331 2020-12-01 112842.0
14994 2022-01-01 100272.0
13923 2020-04-01 84003.0
14790 2021-09-01 76294.0
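These rankings compare individual rows, one per state, sex, and age group, which is why the same month can appear twice (for example 2021-01-01 under Pneumonia and COVID-19 Deaths). A sketch that ranks the national aggregate rows instead, so each month appears exactly once (national is a hypothetical name; df3 still contains the 'United States' / 'All Sexes' / 'All Ages' rows at this point):

# restrict to the nationwide aggregate row for each month, then rank
national = df3[(df3['State'] == 'United States') &
               (df3['Sex'] == 'All Sexes') &
               (df3['Age Group'] == 'All Ages')]
for category in death_categories_top6:
    print(f"Top 5 months for {category}:")
    print(national[['Date', category]].sort_values(by=category, ascending=False).head(5))
    print()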
In [41]:
# Total deaths over time
death_categories = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths','Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']
df_grouped = df3.groupby(['Date', 'Sex'])['Total Deaths'].sum().reset_index()
plt.figure(figsize=(12, 6))
for sex, color in zip(['Male', 'Female'], ['blue', 'orange']):
    df_subset = df_grouped[df_grouped['Sex'] == sex]
    plt.plot(df_subset['Date'], df_subset['Total Deaths'], label=sex, color=color, linewidth=2)
plt.title('Total Deaths by Sex Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
In [42]:
# Deaths over time for the five death categories
death_categories = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths','Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']
plt.figure(figsize=(15, 15))
for i, category in enumerate(death_categories, 1):
    plt.subplot(len(death_categories), 1, i)
    df_grouped = df3.groupby(['Date', 'Sex'])[category].sum().reset_index()
    for sex, color in zip(['Male', 'Female'], ['blue', 'orange']):
        df_subset = df_grouped[df_grouped['Sex'] == sex]
        plt.plot(df_subset['Date'], df_subset[category], label=sex, color=color, linewidth=2)
    plt.title(f'{category} by Sex Over Time')
    plt.xlabel('Date')
    plt.ylabel(category)
    plt.legend()
    plt.grid(True)
plt.tight_layout()
plt.show()
In [43]:
df3.head()
Out[43]:
| | Year | Month | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths | Date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13770 | 2020 | 1 | United States | All Sexes | All Ages | 6.0 | 264677.0 | 17909.0 | 3.0 | 2125.0 | 20037.0 | 2020-01-01 |
| 13771 | 2020 | 1 | United States | All Sexes | Under 1 year | 0.0 | 1784.0 | 41.0 | 0.0 | 8.0 | 49.0 | 2020-01-01 |
| 13772 | 2020 | 1 | United States | All Sexes | 0-17 years | 0.0 | 2966.0 | 90.0 | 0.0 | 63.0 | 153.0 | 2020-01-01 |
| 13773 | 2020 | 1 | United States | All Sexes | 1-4 years | 0.0 | 315.0 | 22.0 | 0.0 | 18.0 | 40.0 | 2020-01-01 |
| 13774 | 2020 | 1 | United States | All Sexes | 5-14 years | 0.0 | 471.0 | 21.0 | 0.0 | 29.0 | 50.0 | 2020-01-01 |
In [44]:
# Remove Year and Month in df3
df3 = df3.drop(['Year', 'Month'], axis=1)
df3.head()
Out[44]:
| | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths | Date |
|---|---|---|---|---|---|---|---|---|---|---|
| 13770 | United States | All Sexes | All Ages | 6.0 | 264677.0 | 17909.0 | 3.0 | 2125.0 | 20037.0 | 2020-01-01 |
| 13771 | United States | All Sexes | Under 1 year | 0.0 | 1784.0 | 41.0 | 0.0 | 8.0 | 49.0 | 2020-01-01 |
| 13772 | United States | All Sexes | 0-17 years | 0.0 | 2966.0 | 90.0 | 0.0 | 63.0 | 153.0 | 2020-01-01 |
| 13773 | United States | All Sexes | 1-4 years | 0.0 | 315.0 | 22.0 | 0.0 | 18.0 | 40.0 | 2020-01-01 |
| 13774 | United States | All Sexes | 5-14 years | 0.0 | 471.0 | 21.0 | 0.0 | 29.0 | 50.0 | 2020-01-01 |
In [115]:
df4 = df3.copy()
df4 = df4[~((df4['State'] == 'United States') | (df4['Sex'] == 'All Sexes') | (df4['Age Group'] == 'All Ages'))]
df4.head()
Out[115]:
| | State | Sex | Age Group | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths | Date |
|---|---|---|---|---|---|---|---|---|---|---|
| 16083 | Alabama | Male | Under 1 year | 0.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 |
| 16086 | Alabama | Male | 5-14 years | 0.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 |
| 16088 | Alabama | Male | 18-29 years | 0.0 | 53.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 |
| 16100 | Alabama | Female | Under 1 year | 0.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 |
| 16104 | Alabama | Female | 15-24 years | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 |
In [117]:
# One-hot encode all object columns in df4
df4 = pd.get_dummies(df4, columns=df4.select_dtypes(include=['object']).columns, dummy_na=False)
df4.head()
Out[117]:
| | COVID-19 Deaths | Total Deaths | Pneumonia Deaths | Pneumonia and COVID-19 Deaths | Influenza Deaths | Pneumonia, Influenza, or COVID-19 Deaths | Date | State_Alabama | State_Alaska | State_Arizona | ... | Age Group_35-44 years | Age Group_40-49 years | Age Group_45-54 years | Age Group_5-14 years | Age Group_50-64 years | Age Group_55-64 years | Age Group_65-74 years | Age Group_75-84 years | Age Group_85 years and over | Age Group_Under 1 year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16083 | 0.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 | True | False | False | ... | False | False | False | False | False | False | False | False | False | True |
| 16086 | 0.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 | True | False | False | ... | False | False | False | True | False | False | False | False | False | False |
| 16088 | 0.0 | 53.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 | True | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 16100 | 0.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 | True | False | False | ... | False | False | False | False | False | False | False | False | False | True |
| 16104 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-01-01 | True | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 78 columns
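Because every level of each categorical gets its own dummy column, the dummies for a given variable sum to one, which adds exact collinearity on top of the intercept fitted by the linear models later. A hedged variant that drops one level per variable may ease the ill-conditioned-matrix warnings seen in the Ridge cross-validation below (df4_pre and df4_alt are hypothetical names; the filter just recreates the one applied two cells above):

# refilter df3 and one-hot encode with drop_first to avoid the dummy-variable trap
df4_pre = df3[~((df3['State'] == 'United States') | (df3['Sex'] == 'All Sexes') | (df3['Age Group'] == 'All Ages'))]
df4_alt = pd.get_dummies(df4_pre, columns=df4_pre.select_dtypes(include=['object']).columns,
                         dummy_na=False, drop_first=True)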
In [119]:
df4.dtypes
Out[119]:
COVID-19 Deaths float64
Total Deaths float64
Pneumonia Deaths float64
Pneumonia and COVID-19 Deaths float64
Influenza Deaths float64
...
Age Group_55-64 years bool
Age Group_65-74 years bool
Age Group_75-84 years bool
Age Group_85 years and over bool
Age Group_Under 1 year bool
Length: 78, dtype: object
In [121]:
df5 = df4.copy()
In [49]:
# Convert Date to integer nanoseconds since the epoch, then cast all columns (including the one-hot booleans) to float
df4['Date'] = df4['Date'].astype('int64')
df4 = df4.astype(float)
print(df4.dtypes)
COVID-19 Deaths float64
Total Deaths float64
Pneumonia Deaths float64
Pneumonia and COVID-19 Deaths float64
Influenza Deaths float64
...
Age Group_55-64 years float64
Age Group_65-74 years float64
Age Group_75-84 years float64
Age Group_85 years and over float64
Age Group_Under 1 year float64
Length: 78, dtype: object
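Casting the datetime to int64 turns Date into nanoseconds since the Unix epoch (values around 1.6e18), many orders of magnitude larger than the death counts. A gentler encoding, which the deep-learning section below also uses for df6, is days since the first observation; a sketch on df5, the pre-cast copy made above (df4_days is a hypothetical name):

# express Date as days since the earliest date, then cast everything to float
df4_days = df5.copy()
df4_days['Date'] = (df4_days['Date'] - df4_days['Date'].min()).dt.days
df4_days = df4_days.astype(float)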
In [50]:
# Correlation heatmap
corr_matrix = df4.corr()
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.show()
In [51]:
# Highest pairwise correlation for each column
corr_matrix = df4.corr()
max_corr_values = {}
for col in corr_matrix.columns:
    max_corr_values[col] = corr_matrix[col].drop(col).max()
sorted_corr = pd.Series(max_corr_values).sort_values(ascending=False).head(20)
print(sorted_corr)
COVID-19 Deaths                             0.972556
Pneumonia, Influenza, or COVID-19 Deaths    0.972556
Pneumonia Deaths                            0.971805
Pneumonia and COVID-19 Deaths               0.966959
Total Deaths                                0.863321
Age Group_85 years and over                 0.378147
State_California                            0.374744
Influenza Deaths                            0.362005
Age Group_75-84 years                       0.293873
Age Group_65-74 years                       0.213524
State_Florida                               0.178320
State_Texas                                 0.170845
Age Group_50-64 years                       0.153834
State_New York City                         0.126236
State_Pennsylvania                          0.118171
State_Ohio                                  0.114080
Age Group_55-64 years                       0.106572
State_Illinois                              0.097415
State_Vermont                               0.087898
Age Group_5-14 years                        0.087898
dtype: float64
In [52]:
# Heatmap restricted to the most highly correlated columns
corr_matrix = df4.corr()
max_corr_values = {}
for col in corr_matrix.columns:
    max_corr_values[col] = corr_matrix[col].drop(col).max()
sorted_cols = pd.Series(max_corr_values).sort_values(ascending=False).head(20).index.tolist()
filtered_corr_matrix = corr_matrix.loc[sorted_cols, sorted_cols]
sns.heatmap(filtered_corr_matrix, annot=False, cmap='coolwarm', cbar=True)
plt.show()
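One caveat with ranking by .max(): it looks only at the largest signed correlation per column, so strong negative relationships are ignored. A sketch using absolute correlations instead (abs_max_corr is a hypothetical name; corr_matrix comes from the cell above):

# rank columns by their strongest correlation in absolute value
abs_max_corr = {col: corr_matrix[col].drop(col).abs().max() for col in corr_matrix.columns}
print(pd.Series(abs_max_corr).sort_values(ascending=False).head(20))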
Modeling¶
In [68]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import roc_curve, auc, precision_recall_curve
In [70]:
# Set X and y, then split into train and test sets (this initial split is redone after augmentation in the next cell)
X = df4.drop('COVID-19 Deaths', axis=1)
y = df4['COVID-19 Deaths']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [72]:
target_size = 100000
current_size = len(X)
num_to_generate = target_size - current_size
def generate_interpolated_samples(X, y, num_samples):
    interpolated_X = []
    interpolated_y = []
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X), size=2, replace=False)
        alpha = np.random.rand()
        new_sample_X = alpha * X[idx1] + (1 - alpha) * X[idx2]
        new_sample_y = alpha * y[idx1] + (1 - alpha) * y[idx2]
        interpolated_X.append(new_sample_X)
        interpolated_y.append(new_sample_y)
    return np.array(interpolated_X), np.array(interpolated_y)

def add_random_noise(X, y, num_samples, noise_level=0.01):
    noisy_X = []
    noisy_y = []
    for _ in range(num_samples):
        idx = np.random.choice(len(X))
        new_sample_X = X[idx] + np.random.normal(0, noise_level, X.shape[1])
        new_sample_y = y[idx] + np.random.normal(0, noise_level)
        noisy_X.append(new_sample_X)
        noisy_y.append(new_sample_y)
    return np.array(noisy_X), np.array(noisy_y)
num_interpolated = num_to_generate // 2
num_noisy = num_to_generate - num_interpolated
X_values = X.values
y_values = y.values
X_interpolated, y_interpolated = generate_interpolated_samples(X_values, y_values, num_interpolated)
X_noisy, y_noisy = add_random_noise(X_values, y_values, num_noisy)
X_augmented = np.vstack([X_values, X_interpolated, X_noisy])
y_augmented = np.hstack([y_values, y_interpolated, y_noisy])
X_augmented = pd.DataFrame(X_augmented, columns=X.columns)
y_augmented = pd.Series(y_augmented, name='target')
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)
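One design caveat: the interpolated rows are built from pairs drawn across the whole dataset before the split, so information from rows that end up in the test set can leak into training samples. A leakage-avoiding variant, as a sketch (X_tr, X_te, and the *_aug names are hypothetical; it reuses the functions and counts defined above), augments only the training portion:

# split the original data first, then augment only the training split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
X_int, y_int = generate_interpolated_samples(X_tr.values, y_tr.values, num_interpolated)
X_noi, y_noi = add_random_noise(X_tr.values, y_tr.values, num_noisy)
X_tr_aug = pd.DataFrame(np.vstack([X_tr.values, X_int, X_noi]), columns=X.columns)
y_tr_aug = pd.Series(np.hstack([y_tr.values, y_int, y_noi]), name='target')
# models would then be fit on (X_tr_aug, y_tr_aug) and evaluated on the untouched (X_te, y_te)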
In [73]:
# Linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 82.42049762225037
R-squared: 0.9891601928591927
In [76]:
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
Mean Absolute Error: 4.715865285080694
In [78]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [82]:
from sklearn.model_selection import cross_val_score
cv_mse = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_r2 = cross_val_score(model, X_train, y_train, scoring='r2', cv=5)
print(f"Cross-Validated MSE: {cv_mse.mean()}")
print(f"Cross-Validated R²: {cv_r2.mean()}")
Cross-Validated MSE: 99.77869593375976
Cross-Validated R²: 0.9867886919625475
In [84]:
# L1 (Lasso) and L2 (Ridge) regularization, cross-validated
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.01)
ridge_mse = -cross_val_score(ridge, X_train, y_train, scoring='neg_mean_squared_error', cv=5).mean()
lasso_mse = -cross_val_score(lasso, X_train, y_train, scoring='neg_mean_squared_error', cv=5).mean()
print(f"Cross-Validated MSE (Ridge): {ridge_mse}")
print(f"Cross-Validated MSE (Lasso): {lasso_mse}")
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=6.1286e-38): result may not be accurate.
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=6.14396e-38): result may not be accurate.
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=6.01141e-38): result may not be accurate.
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=6.09296e-38): result may not be accurate.
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=6.19471e-38): result may not be accurate.
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.987e+04, tolerance: 4.807e+04
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.264e+04, tolerance: 4.594e+04
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.015e+04, tolerance: 4.896e+04
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.196e+04, tolerance: 4.637e+04
Cross-Validated MSE (Ridge): 1.510271994860769
Cross-Validated MSE (Lasso): 1.5358253069445076
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.934e+04, tolerance: 4.860e+04
In [85]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
In [86]:
# L2/Ridge regression
pipeline = Pipeline([
('scaler', StandardScaler()),
('ridge', Ridge())
])
param_grid = {'ridge__alpha': [0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['ridge__alpha']
best_cv_mse = -grid_search.best_score_
print(f"Best Alpha: {best_alpha}")
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (Ridge): {mse}")
print(f"R-squared (Ridge): {r2}")
cv_mse = -cross_val_score(best_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_r2 = cross_val_score(best_model, X_train, y_train, scoring='r2', cv=5)
print(f"Cross-Validated MSE (Ridge): {np.mean(cv_mse)}")
print(f"Cross-Validated R² (Ridge): {np.mean(cv_r2)}")
Best Alpha: 0.1
Mean Squared Error (Ridge): 1.520949127860919
R-squared (Ridge): 0.9997999672934207
Cross-Validated MSE (Ridge): 1.51030214941287
Cross-Validated R² (Ridge): 0.9997929112941932
In [6]:
from math import sqrt
print(f"RMSE (Ridge): {sqrt(1.520949127860919)}")
RMSE (Ridge): 1.233267662699756
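The same figure can also be computed directly from the held-out predictions instead of hard-coding the MSE value (a sketch, assuming y_test and y_pred from the Ridge grid-search cell above; rmse_ridge is a hypothetical name):

# RMSE computed from the Ridge test-set predictions
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE (Ridge): {rmse_ridge}")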
In [87]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt
ridge_model = Ridge(alpha=0.1, random_state=42)
train_sizes, train_scores, validation_scores = learning_curve(
estimator=ridge_model,
X=X_train,
y=y_train,
cv=5,
scoring='neg_mean_squared_error',
n_jobs=-1
)
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label="Training Error", color="blue", marker="o")
plt.plot(train_sizes, validation_scores_mean, label="Validation Error", color="green", marker="s")
plt.title("Learning Curve for Ridge Regression")
plt.xlabel("Training Set Size")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.grid()
plt.show()
In [88]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [89]:
# Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Mean Squared Error: 8.791593286627121
R-squared: 0.9988437442330884
Mean Absolute Error: 0.7599344260619
In [10]:
print(f"RMSE (RF): {sqrt(8.791593286627121)}")
RMSE (RF): 2.965062105020251
In [90]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE: {-cv_scores.mean()}")
Cross-validated MSE: 13.042910890148272
In [91]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [92]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import randint, uniform
param_dist = {
'n_estimators': [100, 200, 300, 500],
'max_depth': [10, 20, 30, None],
'min_samples_split': randint(2, 15),
'min_samples_leaf': randint(1, 10),
'max_features': ['sqrt', 'log2']
}
rf_model = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
estimator=rf_model,
param_distributions=param_dist,
n_iter=100,
cv=5,
scoring='neg_mean_squared_error',
n_jobs=-1,
verbose=2,
random_state=42
)
random_search.fit(X_train, y_train)
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validated MSE: {-random_search.best_score_}")
best_rf_model = random_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Optimized Mean Squared Error: {mse}")
print(f"Optimized R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 300}
Best Cross-Validated MSE: 123.18656446929113
Optimized Mean Squared Error: 100.84344694932958
Optimized R-squared: 0.9867372371208559
Mean Absolute Error: 2.850470181925766
In [93]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [94]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(
estimator=best_rf_model,
X=X_train,
y=y_train,
cv=5,
scoring='neg_mean_squared_error',
n_jobs=-1,
)
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label='Training Error', color='blue', marker='o')
plt.plot(train_sizes, test_scores_mean, label='Validation Error', color='green', marker='s')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve for Random Forest')
plt.legend()
plt.grid(True)
plt.show()
In [95]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Mean Squared Error: 26.598299155986606
R-squared: 0.9965018358121808
Mean Absolute Error: 2.602388853480325
In [96]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE: {-cv_scores.mean()}")
Cross-validated MSE: 28.57541184512715
In [97]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [98]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
param_dist = {
'n_estimators': [400, 600, 800, 1000],
'max_depth': [3, 4, 5, 6],
'learning_rate': uniform(0.01, 0.03),
'subsample': uniform(0.7, 0.2),
'max_features': ['sqrt', 'log2', None],
'min_samples_split': [2, 4, 6],
'min_samples_leaf': [1, 2, 4]
}
gb_model = GradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(
estimator=gb_model,
param_distributions=param_dist,
n_iter=100,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1,
random_state=42
)
random_search.fit(X_train, y_train)
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validated MSE: {-random_search.best_score_}")
best_gb_model = random_search.best_estimator_
y_pred = best_gb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Optimized Mean Squared Error: {mse}")
print(f"Optimized R-squared: {r2}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'learning_rate': 0.03461918427231866, 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000, 'subsample': 0.8810701283912127}
Best Cross-Validated MSE: 6.6972207975294555
Optimized Mean Squared Error: 6.964972264887826
Optimized R-squared: 0.999083978400149
In [9]:
print(f"RMSE (GBM): {sqrt(6.6972207975294555)}")
RMSE (GBM): 2.587898915632034
In [99]:
residuals = y_test - y_pred
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
In [100]:
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
In [101]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
In [102]:
from sklearn.model_selection import learning_curve
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(
estimator=best_gb_model, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label='Training Error', color='blue', marker='o')
plt.plot(train_sizes, test_scores_mean, label='Validation Error', color='green', marker='s')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve')
plt.legend()
plt.show()
In [111]:
# Deep Learning
In [150]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
df6 = df5.copy()
df7 = df5.copy()
df6['Date'] = pd.to_datetime(df6['Date'])
df6['Date'] = (df6['Date'] - df6['Date'].min()) / pd.Timedelta(days=1)
target_column = 'COVID-19 Deaths'
X = df6.drop(columns=['Date', target_column])
y = df6[target_column]
target_size = len(df6) * 5
current_size = len(X)
num_to_generate = target_size - current_size
def generate_interpolated_samples(X, y, num_samples):
    interpolated_X = []
    interpolated_y = []
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X), size=2, replace=False)
        alpha = np.random.rand()
        new_sample_X = alpha * X.iloc[idx1] + (1 - alpha) * X.iloc[idx2]
        new_sample_y = alpha * y.iloc[idx1] + (1 - alpha) * y.iloc[idx2]
        interpolated_X.append(new_sample_X)
        interpolated_y.append(new_sample_y)
    return pd.DataFrame(interpolated_X, columns=X.columns), np.array(interpolated_y)

def add_random_noise(X, y, num_samples, noise_level=0.01):
    noisy_X = []
    noisy_y = []
    for _ in range(num_samples):
        idx = np.random.choice(len(X))
        new_sample_X = X.iloc[idx] + np.random.normal(0, noise_level, len(X.columns))
        new_sample_y = y.iloc[idx] + np.random.normal(0, noise_level)
        noisy_X.append(new_sample_X)
        noisy_y.append(new_sample_y)
    return pd.DataFrame(noisy_X, columns=X.columns), np.array(noisy_y)
num_interpolated = num_to_generate // 2
num_noisy = num_to_generate - num_interpolated
X_interpolated, y_interpolated = generate_interpolated_samples(X, y, num_interpolated)
X_noisy, y_noisy = add_random_noise(X, y, num_noisy)
X_augmented = pd.concat([X, X_interpolated, X_noisy], ignore_index=True)
y_augmented = np.hstack([y, y_interpolated, y_noisy])
df_augmented = X_augmented.copy()
df_augmented[target_column] = y_augmented
# Only the first 2 * len(df6) rows receive a Date here (the remainder are NaT); the Date column is dropped again before scaling below
df_augmented['Date'] = pd.concat([df6['Date'], df6['Date']], ignore_index=True)
In [206]:
scaler_X = MinMaxScaler()
scaled_data_X = scaler_X.fit_transform(df_augmented.drop(columns=['Date', target_column]))
X_train, y_train = scaled_data_X[:-int(0.2 * len(scaled_data_X))], df_augmented[target_column].iloc[:-int(0.2 * len(df_augmented))]
X_test, y_test = scaled_data_X[-int(0.2 * len(scaled_data_X)):], df_augmented[target_column].iloc[-int(0.2 * len(df_augmented)):]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
model = Sequential([
LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
Dropout(0.2),
LSTM(32, activation='relu'),
Dropout(0.2),
Dense(16, activation='relu'),
Dense(1)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=32)
y_pred = model.predict(X_test)
Epoch 1/30
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
Epoch 1/30 - loss: 4284.4507 - mae: 28.7094 - val_loss: 603.2394 - val_mae: 13.0328
Epoch 2/30 - loss: 688.1735 - mae: 14.7350 - val_loss: 244.2824 - val_mae: 7.9310
Epoch 3/30 - loss: 427.4215 - mae: 10.9793 - val_loss: 153.3202 - val_mae: 6.2393
Epoch 4/30 - loss: 350.9440 - mae: 9.3596 - val_loss: 151.7106 - val_mae: 6.3204
Epoch 5/30 - loss: 270.3113 - mae: 8.4317 - val_loss: 112.6511 - val_mae: 5.8913
Epoch 6/30 - loss: 257.1239 - mae: 8.0056 - val_loss: 120.6163 - val_mae: 6.3342
Epoch 7/30 - loss: 232.8642 - mae: 7.5797 - val_loss: 102.2034 - val_mae: 5.4179
Epoch 8/30 - loss: 217.2368 - mae: 7.3020 - val_loss: 90.6770 - val_mae: 4.9463
Epoch 9/30 - loss: 221.9447 - mae: 7.1101 - val_loss: 120.1041 - val_mae: 5.6942
Epoch 10/30 - loss: 194.8808 - mae: 6.7943 - val_loss: 76.6548 - val_mae: 5.4320
Epoch 11/30 - loss: 184.7319 - mae: 6.6637 - val_loss: 161.5324 - val_mae: 5.4128
Epoch 12/30 - loss: 197.6679 - mae: 6.5735 - val_loss: 66.7648 - val_mae: 5.2191
Epoch 13/30 - loss: 176.8647 - mae: 6.3461 - val_loss: 77.2285 - val_mae: 4.5846
Epoch 14/30 - loss: 182.1945 - mae: 6.2603 - val_loss: 69.7907 - val_mae: 4.4851
Epoch 15/30 - loss: 154.3186 - mae: 6.0397 - val_loss: 62.3844 - val_mae: 4.3544
Epoch 16/30 - loss: 167.0663 - mae: 6.0321 - val_loss: 54.4324 - val_mae: 4.1472
Epoch 17/30 - loss: 171.3098 - mae: 5.9462 - val_loss: 72.6816 - val_mae: 5.4798
Epoch 18/30 - loss: 147.4979 - mae: 5.7463 - val_loss: 64.2266 - val_mae: 4.3810
Epoch 19/30 - loss: 166.7913 - mae: 5.7907 - val_loss: 86.1697 - val_mae: 4.7513
Epoch 20/30 - loss: 169.6841 - mae: 5.8199 - val_loss: 250.4195 - val_mae: 8.9573
Epoch 21/30 - loss: 159.7842 - mae: 5.6700 - val_loss: 84.8684 - val_mae: 6.3130
Epoch 22/30 - loss: 156.0296 - mae: 5.6346 - val_loss: 51.3129 - val_mae: 4.5952
Epoch 23/30 - loss: 133.6155 - mae: 5.4115 - val_loss: 58.2049 - val_mae: 5.1100
Epoch 24/30 - loss: 139.6471 - mae: 5.3920 - val_loss: 242.0278 - val_mae: 6.3070
Epoch 25/30 - loss: 148.5286 - mae: 5.4414 - val_loss: 88.7733 - val_mae: 5.9347
Epoch 26/30 - loss: 205.9672 - mae: 5.5292 - val_loss: 87.0075 - val_mae: 5.3966
Epoch 27/30 - loss: 113.4149 - mae: 5.1389 - val_loss: 160.8645 - val_mae: 6.7774
Epoch 28/30 - loss: 128.2009 - mae: 5.2293 - val_loss: 67.4726 - val_mae: 5.1023
Epoch 29/30 - loss: 127.4477 - mae: 5.1347 - val_loss: 128.9587 - val_mae: 6.3164
Epoch 30/30 - loss: 113.1685 - mae: 5.0757 - val_loss: 64.3566 - val_mae: 5.5248
611/611 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
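The validation loss above is noisy (note the spikes at epochs 20 and 24), so the last epoch is not necessarily the best one. A minimal sketch, assuming the same model, X_train, and y_train as in the cell above, of adding a Keras EarlyStopping callback that halts training once val_loss stops improving and restores the best weights; the patience of 5 epochs is an illustrative choice, not taken from the notebook:

from tensorflow.keras.callbacks import EarlyStopping

# Stop after 5 epochs without val_loss improvement and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    callbacks=[early_stop],
)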
In [207]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the LSTM's test-set predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 66.14676826523599
R-squared: 0.9927328175627637
In [208]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def create_model():
    # Same architecture as the baseline LSTM above
    model = Sequential([
        LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# 5-fold cross-validation on the training split
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_mse = []
X = X_train
y = y_train

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train a fresh model on this fold and score it on the held-out fold
    model = create_model()
    model.fit(X_fold_train, y_fold_train, epochs=30, batch_size=32, verbose=0)
    y_pred = model.predict(X_fold_val)
    mse = mean_squared_error(y_fold_val, y_pred)
    fold_mse.append(mse)
    print(f"Fold {fold + 1} MSE: {mse}")

print(f"Average Cross-Validation MSE: {np.mean(fold_mse)}")
Training fold 1...
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. (this warning repeats for every fold)
Fold 1 MSE: 58.864196662395436
Training fold 2...
Fold 2 MSE: 101.79963373143964
Training fold 3...
Fold 3 MSE: 109.49627370901372
Training fold 4...
Fold 4 MSE: 122.16994173663969
Training fold 5...
Fold 5 MSE: 111.92000552857714
Average Cross-Validation MSE: 100.85001027361311
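The fold errors above range from roughly 59 to 122, so the plain average hides a fair amount of spread. A small sketch, assuming the fold_mse list from the loop above, reporting the result as mean ± standard deviation and in RMSE units:

import numpy as np

fold_mse_arr = np.array(fold_mse)  # the five per-fold MSE values printed above
print(f"CV MSE:  {fold_mse_arr.mean():.2f} ± {fold_mse_arr.std():.2f}")
print(f"CV RMSE: {np.sqrt(fold_mse_arr).mean():.2f} ± {np.sqrt(fold_mse_arr).std():.2f}")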
In [212]:
# Write the predicted values back into the last 20% of df7 (the test rows)
y_pred_rescaled = y_pred.flatten()
num_test_rows = int(0.2 * len(df7))
y_pred_rescaled_trimmed = y_pred_rescaled[-num_test_rows:]

if len(y_pred_rescaled_trimmed) != num_test_rows:
    raise ValueError(f"Trimmed predicted values' length ({len(y_pred_rescaled_trimmed)}) does not match the expected number of rows ({num_test_rows})")

target_column_index = df7.columns.get_loc(target_column)
df7.iloc[-num_test_rows:, target_column_index] = y_pred_rescaled_trimmed
In [214]:
df5_grouped = df5.groupby('Date')[target_column].mean()
df7_grouped = df7.groupby('Date')[target_column].mean()
plt.figure(figsize=(12, 6))
plt.plot(df5_grouped.index, df5_grouped.values, label='Original Data (df5)', color='blue', linewidth=2)
plt.plot(df7_grouped.index, df7_grouped.values, label='Modified Data (df7)', color='red', linestyle='--', linewidth=2)
plt.xlabel('Date')
plt.ylabel('COVID-19 Deaths')
plt.title('COVID-19 Deaths Comparison: Grouped by Date')
plt.legend()
plt.grid(True)
plt.show()
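Beyond the visual overlay, the gap between the two curves can be quantified. A short sketch, assuming the df5_grouped and df7_grouped series from the cell above share the same date index:

# Align on common dates and summarise the gap between original and modified series
diff = (df7_grouped - df5_grouped).dropna()
print(f"Mean absolute difference: {diff.abs().mean():.2f}")
print(f"Largest single-date difference: {diff.abs().max():.2f}")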
In [225]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from kerastuner.tuners import RandomSearch

# Scale the features (the target is left on its original scale)
scaler_X = MinMaxScaler()
scaled_data_X = scaler_X.fit_transform(df_augmented.drop(columns=['Date', target_column]))

# Chronological 80/20 split, then reshape to (samples, timesteps=1, features) for the LSTM
X_train, y_train = scaled_data_X[:-int(0.2 * len(scaled_data_X))], df_augmented[target_column].iloc[:-int(0.2 * len(df_augmented))]
X_test, y_test = scaled_data_X[-int(0.2 * len(scaled_data_X)):], df_augmented[target_column].iloc[-int(0.2 * len(df_augmented)):]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

def build_model(hp):
    # Search space: LSTM widths, dropout rates, and the optimizer
    model = Sequential([
        LSTM(
            units=hp.Int('units1', min_value=32, max_value=128, step=16),
            activation='relu',
            input_shape=(X_train.shape[1], X_train.shape[2]),
            return_sequences=True
        ),
        Dropout(hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)),
        LSTM(
            units=hp.Int('units2', min_value=16, max_value=64, step=16),
            activation='relu'
        ),
        Dropout(hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(
        optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
        loss='mse',
        metrics=['mae']
    )
    return model

# Random search over 10 hyperparameter combinations, selected on validation MAE
tuner = RandomSearch(
    build_model,
    objective='val_mae',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_logs',
    project_name='lstm_tuning'
)
tuner.search(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

# Refit a model with the best hyperparameters and evaluate it on the test split
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameters: units1={best_hps.get('units1')}, units2={best_hps.get('units2')}, "
      f"dropout1={best_hps.get('dropout1')}, dropout2={best_hps.get('dropout2')}, "
      f"optimizer={best_hps.get('optimizer')}")
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=32)
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
Reloading Tuner from tuner_logs\lstm_tuning\tuner0.json
Best hyperparameters: units1=96, units2=64, dropout1=0.1, dropout2=0.1, optimizer=rmsprop
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
Epoch 1/30 - loss: 4762.8560 - mae: 28.7164 - val_loss: 401.6913 - val_mae: 10.0315
Epoch 2/30 - loss: 351.7172 - mae: 9.9824 - val_loss: 165.0062 - val_mae: 6.2835
Epoch 3/30 - loss: 180.0067 - mae: 6.9049 - val_loss: 154.5481 - val_mae: 7.8355
Epoch 4/30 - loss: 145.4411 - mae: 6.1036 - val_loss: 114.0388 - val_mae: 4.5863
Epoch 5/30 - loss: 124.6900 - mae: 5.5366 - val_loss: 87.4020 - val_mae: 3.9355
Epoch 6/30 - loss: 103.0333 - mae: 5.1290 - val_loss: 86.1112 - val_mae: 4.8812
Epoch 7/30 - loss: 101.6065 - mae: 4.9477 - val_loss: 66.2670 - val_mae: 3.4518
Epoch 8/30 - loss: 91.5161 - mae: 4.7847 - val_loss: 68.5574 - val_mae: 3.7937
Epoch 9/30 - loss: 92.4686 - mae: 4.6195 - val_loss: 55.3026 - val_mae: 4.2518
Epoch 10/30 - loss: 83.9193 - mae: 4.4766 - val_loss: 93.0204 - val_mae: 5.1037
Epoch 11/30 - loss: 73.8946 - mae: 4.3010 - val_loss: 43.4825 - val_mae: 3.1535
Epoch 12/30 - loss: 75.3073 - mae: 4.2707 - val_loss: 33.1330 - val_mae: 2.5636
Epoch 13/30 - loss: 68.0840 - mae: 4.0985 - val_loss: 42.8328 - val_mae: 3.3675
Epoch 14/30 - loss: 64.2629 - mae: 4.0251 - val_loss: 49.7365 - val_mae: 3.4961
Epoch 15/30 - loss: 71.6059 - mae: 3.9543 - val_loss: 33.0844 - val_mae: 3.3712
Epoch 16/30 - loss: 55.7071 - mae: 3.8210 - val_loss: 27.5599 - val_mae: 2.5060
Epoch 17/30 - loss: 58.9544 - mae: 3.7550 - val_loss: 33.5092 - val_mae: 3.0504
Epoch 18/30 - loss: 49.7374 - mae: 3.6048 - val_loss: 18.8979 - val_mae: 1.8980
Epoch 19/30 - loss: 53.1799 - mae: 3.5613 - val_loss: 19.2844 - val_mae: 1.9462
Epoch 20/30 - loss: 50.5979 - mae: 3.4279 - val_loss: 31.2365 - val_mae: 2.2756
Epoch 21/30 - loss: 48.3958 - mae: 3.4575 - val_loss: 20.1160 - val_mae: 2.1782
Epoch 22/30 - loss: 51.4337 - mae: 3.4064 - val_loss: 20.3351 - val_mae: 2.4709
Epoch 23/30 - loss: 48.2273 - mae: 3.3874 - val_loss: 16.4264 - val_mae: 1.9576
Epoch 24/30 - loss: 42.6366 - mae: 3.2962 - val_loss: 13.0901 - val_mae: 1.9384
Epoch 25/30 - loss: 45.1562 - mae: 3.2402 - val_loss: 15.1436 - val_mae: 1.9503
Epoch 26/30 - loss: 45.7242 - mae: 3.2364 - val_loss: 13.3391 - val_mae: 1.7806
Epoch 27/30 - loss: 53.2993 - mae: 3.2616 - val_loss: 31.7279 - val_mae: 3.2128
Epoch 28/30 - loss: 43.9514 - mae: 3.1336 - val_loss: 13.9642 - val_mae: 1.7350
Epoch 29/30 - loss: 43.7534 - mae: 3.1034 - val_loss: 14.1895 - val_mae: 1.8326
Epoch 30/30 - loss: 42.4841 - mae: 3.0597 - val_loss: 13.0012 - val_mae: 1.6620
611/611 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
MSE: 12.771035268984182
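Once the search has finished, the tuner directory (tuner_logs\lstm_tuning) keeps every trial, so results can be inspected and the refit model persisted without rerunning the search. A minimal sketch, assuming the tuner and best_model objects from the cell above; the file name is illustrative:

# Summarise the top trials recorded under tuner_logs/lstm_tuning
tuner.results_summary(num_trials=3)

# Persist the refit model so the tuned weights can be reloaded later
best_model.save('lstm_tuned.keras')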
In [227]:
# Test-set metrics for the tuned model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 12.771035268984182
R-squared: 0.9985969164383067
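Putting the two runs side by side makes the gain from tuning explicit. A small sketch that tabulates the metrics reported earlier in this notebook (baseline MSE 66.15, R² 0.9927; tuned MSE 12.77, R² 0.9986):

import pandas as pd

comparison = pd.DataFrame(
    {'MSE': [66.146768, 12.771035], 'R-squared': [0.992733, 0.998597]},
    index=['Baseline LSTM', 'Tuned LSTM (RandomSearch)']
)
print(comparison)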
In [11]:
print(f"RMSE (LSTM): {sqrt(12.771035268984182)}")
RMSE (LSTM): 3.5736585271936914
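The same figure can be obtained without hard-coding the MSE; a short sketch, assuming y_test and y_pred from the tuned model above are still in scope:

import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE (LSTM): {rmse}")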
In [231]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def create_model():
    # Hard-codes the tuned hyperparameters (96/64 units, dropout 0.1, rmsprop)
    model = Sequential([
        LSTM(96, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        Dropout(0.1),
        LSTM(64, activation='relu'),
        Dropout(0.1),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model

# 5-fold cross-validation of the tuned architecture on the training split
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_mse = []
X = X_train
y = y_train

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    # Build a fresh tuned model for each fold (equivalent to tuner.hypermodel.build(best_hps))
    model = create_model()
    model.fit(X_fold_train, y_fold_train, epochs=20, batch_size=32, verbose=0)
    y_pred = model.predict(X_fold_val)
    mse = mean_squared_error(y_fold_val, y_pred)
    fold_mse.append(mse)
    print(f"Fold {fold + 1} MSE: {mse}")

print(f"Average Cross-Validation MSE: {np.mean(fold_mse)}")
Training fold 1...
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. (this warning repeats for every fold)
Fold 1 MSE: 32.09960144679565
Training fold 2...
Fold 2 MSE: 14.963886774129815
Training fold 3...
Fold 3 MSE: 60.439355601183664
Training fold 4...
Fold 4 MSE: 19.8639216633464
Training fold 5...
Fold 5 MSE: 22.022435886051785
Average Cross-Validation MSE: 29.877840274301462
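One caveat: KFold with shuffle=True lets each fold train on rows that come after the rows it validates on, which can be optimistic for time-ordered data. A hedged sketch of swapping in scikit-learn's TimeSeriesSplit, reusing X, y, create_model, mean_squared_error, and np from the cell above, so validation always lies after training:

from sklearn.model_selection import TimeSeriesSplit

# Expanding-window splits: each fold validates on rows that come after its training rows
tscv = TimeSeriesSplit(n_splits=5)
ts_fold_mse = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    model = create_model()
    model.fit(X[train_idx], y.iloc[train_idx], epochs=20, batch_size=32, verbose=0)
    y_val_pred = model.predict(X[val_idx])
    ts_fold_mse.append(mean_squared_error(y.iloc[val_idx], y_val_pred))
    print(f"Fold {fold + 1} MSE: {ts_fold_mse[-1]}")
print(f"Average time-series CV MSE: {np.mean(ts_fold_mse)}")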
In [233]:
# Write the tuned model's predicted values back into the last 20% of df7 (the test rows)
y_pred_rescaled = y_pred.flatten()
num_test_rows = int(0.2 * len(df7))
y_pred_rescaled_trimmed = y_pred_rescaled[-num_test_rows:]

if len(y_pred_rescaled_trimmed) != num_test_rows:
    raise ValueError(f"Trimmed predicted values' length ({len(y_pred_rescaled_trimmed)}) does not match the expected number of rows ({num_test_rows})")

target_column_index = df7.columns.get_loc(target_column)
df7.iloc[-num_test_rows:, target_column_index] = y_pred_rescaled_trimmed
In [235]:
df5_grouped = df5.groupby('Date')[target_column].mean()
df7_grouped = df7.groupby('Date')[target_column].mean()
plt.figure(figsize=(12, 6))
plt.plot(df5_grouped.index, df5_grouped.values, label='Original Data (df5)', color='blue', linewidth=2)
plt.plot(df7_grouped.index, df7_grouped.values, label='Modified Data (df7)', color='red', linestyle='--', linewidth=2)
plt.xlabel('Date')
plt.ylabel('COVID-19 Deaths')
plt.title('COVID-19 Deaths Comparison: Grouped by Date')
plt.legend()
plt.grid(True)
plt.show()